Load packages
library(tidyverse) ## Always load tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext) ## Needed for text analysis
library(rstudioapi) ## set working directory
library(knitr) ## print nice tables
library(plotly) ## Interactive plots
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer) ## Color palettes
Environment prep
## Suppress warnings and messages in every knitted chunk
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
## Set directory to wherever this script lives
## NOTE(review): getActiveDocumentContext() comes from rstudioapi, so this
## only works when the script is run interactively inside RStudio; it will
## fail when sourced or knitted outside the IDE -- consider here::here()
setwd(dirname(getActiveDocumentContext()$path))
Load overall product info
## List the csv files bundled inside the zip (list = TRUE inspects
## the archive without extracting anything to disk)
utils::unzip("sephora_data.zip", list = TRUE)$Name
## [1] "product_info.csv" "reviews_0-250.csv" "reviews_1250-end.csv"
## [4] "reviews_250-500.csv" "reviews_500-750.csv" "reviews_750-1250.csv"
## Load product info: product ids, listed review counts, and the category
## hierarchy. `category` is the most specific category available: the
## tertiary category when present, otherwise the secondary one.
## coalesce() replaces the redundant `ifelse(is.na(x) == TRUE, ...)`
## construction with the idiomatic NA-fallback.
product_info <- read_csv(unz("sephora_data.zip", "product_info.csv")) %>%
  select(product_id, reviews, primary_category, secondary_category, tertiary_category) %>%
  mutate(category = coalesce(tertiary_category, secondary_category))
Which categories have the most reviews?
## Top 10 categories by total listed review count.
## slice_max() returns the n largest rows already sorted descending,
## collapsing the arrange(desc(...)) %>% head() pair.
product_info %>%
  group_by(category) %>%
  summarize(reviews = sum(reviews, na.rm = TRUE)) %>%
  slice_max(reviews, n = 10, with_ties = FALSE) %>%
  kable()
| Foundation |
224222 |
| Mascara |
214671 |
| Moisturizers |
205380 |
| Perfume |
188490 |
| Face Serums |
173771 |
| Eyeliner |
123261 |
| Face Wash & Cleansers |
121357 |
| Makeup |
114262 |
| Eyebrow |
113709 |
| Mini Size |
106088 |
Load individual reviews
## Read every reviews_*.csv inside the zip and stack them into one tibble,
## keeping only the columns the analysis needs.
reviews <- utils::unzip("sephora_data.zip", list = TRUE)$Name %>%
  str_subset("^reviews_.*\\.csv$") %>%
  map_dfr(\(file) read_csv(unz("sephora_data.zip", file),
                           show_col_types = FALSE,
                           col_select = c(product_id, product_name, brand_name,
                                          submission_time, rating, is_recommended,
                                          review_title, review_text))) %>%
  ## first four characters of submission_time are the calendar year
  mutate(year = as.numeric(substr(submission_time, 1, 4))) %>%
  ## drop rows duplicated across the review files
  distinct() %>%
  ## attach the category info derived from product_info
  inner_join(product_info, by = "product_id")
Recount categories with highest number of reviews
## Top 10 categories by number of individual review rows.
## count(sort = TRUE) is shorthand for group_by + summarise(n()) +
## arrange(desc(n)); `name =` keeps the original column name.
reviews %>%
  count(category, name = "reviews", sort = TRUE) %>%
  slice_head(n = 10) %>%
  kable()
| Moisturizers |
206032 |
| Face Serums |
174482 |
| Face Wash & Cleansers |
121672 |
| Mini Size |
85465 |
| Eye Creams & Treatments |
70418 |
| Face Masks |
66818 |
| Lip Balms & Treatments |
61602 |
| Face Oils |
41844 |
| Face Sunscreen |
39079 |
| Toners |
35846 |
Make files for type
## Pick subtype
subtype_reviews <- reviews %>% filter(category == "Moisturizers")
## Merge the title of the review with the actual review, and unnest tokens.
## na.rm = TRUE stops unite() from pasting the literal string "NA" into the
## combined text when a review is missing its title or body, which would
## otherwise inject spurious "na" tokens into every downstream count.
## (Also fixes the "and_test" typo in the united column name.)
tokens <- subtype_reviews %>%
  unite("review_title_and_text", review_title, review_text, sep = " ", na.rm = TRUE) %>%
  unnest_tokens(word, review_title_and_text)
## Count use of each token, and average across ratings/recommendation given
token_counts <- tokens %>%
  group_by(word) %>%
  summarise(
    n = n(),
    average_rating = mean(rating, na.rm = TRUE),
    average_recommendation = mean(is_recommended, na.rm = TRUE)
  ) %>%
  arrange(desc(n)) %>%
  ## keep only reasonably common words
  filter(n > 200)
## Group word use by year: per-word, per-year counts and averages.
## .groups = "drop" makes the post-summarise grouping explicit (summarise
## would otherwise leave the result silently grouped by word and emit a
## message); the word-level grouping needed for the final filter is then
## re-established deliberately.
token_counts_year <- tokens %>%
  group_by(word, year) %>%
  summarise(n = n(),
            average_rating = mean(rating, na.rm = TRUE),
            average_recommendation = mean(is_recommended, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(n)) %>%
  filter(n > 20) %>% ## keep words used more than 20x per year
  group_by(word) %>%
  filter(n() > 10) %>% ## keep words that cleared that bar in more than 10 distinct years
  ungroup()
Look at distribution of word counts
## Distribution of word counts on a log10 x axis, with reference lines
## marking the mean (dashed) and median (solid).
token_counts %>%
  ggplot(aes(x = n)) +
  geom_histogram(fill = "cornflowerblue") +
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)), lty = 2) +
  geom_vline(aes(xintercept = median(n, na.rm = TRUE))) +
  scale_x_log10(labels = \(x) format(x, scientific = FALSE)) +
  labs(title = "Word Counts", subtitle = "solid line = median; dotted line = mean") +
  theme_classic()

Look at how key words are distributed across average rating
Good sanity check to see if desirable words float to the top,
etc.
## Scatter frequent non-stopword tokens by usage count vs average rating.
## Conditions are combined into a single filter(): word contains a letter,
## is longer than one character, and is used more than 100 times.
token_counts %>%
  anti_join(get_stopwords()) %>%
  filter(str_detect(word, "[A-Za-z]"),
         nchar(word) > 1,
         n > 100) %>%
  ggplot(aes(x = n, y = average_rating)) +
  geom_text(aes(label = word), check_overlap = TRUE, show.legend = FALSE,
            vjust = "top", hjust = "left") +
  scale_x_log10(labels = \(x) format(x, scientific = FALSE)) +
  theme_classic()

Run linear models and extract results
## Fit a per-word linear model of average rating over year, and extract the
## slope, intercept, and slope p-value. This replaces the superseded
## dplyr::do() pattern (whose `filter(!is.null(model))` was a no-op on a
## list-column) with group_split() + map_dfr().
linear_slopes <- token_counts_year %>%
  group_split(word) %>%
  map_dfr(\(word_data) {
    fit <- lm(average_rating ~ year, data = word_data)
    coefs <- summary(fit)$coefficients
    tibble(
      word = word_data$word[1],
      slope = coef(fit)[2],
      intercept = coef(fit)[1],
      ## the slope p-value only exists when both terms were estimable
      significance = if (nrow(coefs) >= 2 && ncol(coefs) >= 4) coefs[2, 4] else NA_real_
    )
  })
Select words with highest and lowest linear coefficients
In other words, which keywords have the largest decline or increase
in associated rating?
## Five keywords whose associated rating declines fastest (most negative slope)
declining_words <- linear_slopes %>%
  arrange(slope) %>%
  slice_head(n = 5) %>%
  select(word, slope)
## Five keywords whose associated rating climbs fastest
ascending_words <- linear_slopes %>%
  arrange(slope) %>%
  slice_tail(n = 5) %>%
  select(word, slope)
## Combine, ordered from steepest increase down to steepest decline
shifting_words <- bind_rows(declining_words, ascending_words) %>%
  arrange(desc(slope))
Time to plot!
## Set up some nice colors: interpolate the 10-color RdYlGn brewer palette
## and reverse it so rising keywords map to green, declining ones to red,
## then name each color by its keyword (in slope order from shifting_words)
colour_map <- set_names(rev(colorRampPalette(brewer.pal(10, "RdYlGn"))(10)),
shifting_words$word)
## Make sure plot legend stays in order of coefs (not alphabetical)
word_order <- shifting_words$word
## Set up plot: one fitted rating-vs-year line per shifting keyword.
## The join key is spelled out (consistent with the joins above) so the
## implicit-key message is avoided.
token_trajectories <- shifting_words %>%
  inner_join(token_counts_year, by = "word") %>%
  ## fix legend ordering to the slope order, not alphabetical
  mutate(word = factor(word, levels = word_order)) %>%
  ggplot(aes(x = year, y = average_rating, group = word, color = word)) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = colour_map) +
  scale_x_continuous(breaks = seq(2008, 2022, by = 2)) +
  scale_y_continuous(limits = c(1, 5), breaks = 1:5) +
  labs(x = "Year",
       y = "Average Rating",
       color = "Keyword",
       title = "Trends in Ratings of Moisturizers") +
  theme_bw()
## Make interactive
ggplotly(token_trajectories)
Let’s take a closer look at one of our keywords
## Show three example recent 5-star reviews that mention "packaging".
## slice_sample() replaces the superseded sample_n(); note the draw is
## random -- call set.seed() beforehand for a reproducible sample.
subtype_reviews %>%
  filter(year >= 2022,
         rating == 5,
         str_detect(review_text, "packaging")) %>%
  slice_sample(n = 3) %>%
  select(review_text) %>%
  kable()
| This moisturizer is incredible for dry skin! I’ve
always had super dry skin and live in a very cold dry place which
doesn’t help and this product has been incredible! I use this at night
after cleansing and using the hyaluronic acid serum from the same
brand/line and they work incredibly together to hydrate my skin! I
absolutely love these products and as a bonus I love the green
packaging! |
| Very luxe, treat yourself! This crème fondante is as
luxurious as the name sounds. It’s thick, hydrating, soft, buttery and
absorbs into your skin with no greasy after effect. It really feels
amazing on your skin and super hydrating. It has a soft scent that seems
to dissipate quickly. While this may seem pricey, if you are looking for
something to hydrate, firm, give a glow, and plump your skin this is it!
The packaging is very luxe too. I am completely pleased with this crème.
I received this product from Bzzagent for an honest review and I feel
this is honest. If you want a luxurious crème to treat yourself and
splurge, a little goes a long way, get this! |
| First of all, beautiful packaging! The moisturizer
leaves my skin feeling moisturized after application while also leaving
a slight glow. The texture of the product is a bit thick hence why I use
it for my night time skin care routine. Overall, I love this
product. |